library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.6
## ✔ tidyr   0.8.1     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
knitr::opts_knit$set(root.dir = "/Users/Jeremy/Documents/R/Modelling-Elections")
load("./Clean-Data/data_mod.rda")
load("./Clean-Data/var_imp_all.rda")
load("./Clean-Data/std_coef_all.rda")

Superset

Determined by Akaike weights (top 5)

# Top 5 variables
superset_AW_vars <- var_imp_all %>% 
  group_by(year) %>% 
  top_n(n = 5, wt = sum_w) %>%
  ungroup() %>% 
  select(var, varname) %>% 
  unique()

Model-averaged coefficients top 5

# Top 5 variables
superset_SC_vars <- std_coef_all %>% 
  group_by(year) %>% 
  top_n(n = 5, wt = abs(avg_coef)) %>%
  ungroup() %>% 
  select(var, varname)  %>% 
  unique()

Union of the two

superset <- bind_rows(superset_AW_vars, superset_SC_vars) %>% group_by(varname) %>% unique()

Regression

Run a regression each year using this superset

fit_ss_16 <- lm(Perc_LNP ~ ., 
                data = data_mod %>% 
                  filter(year == "2016") %>% 
                  select(Perc_LNP,  (superset$varname %>% as.character)))

fit_ss_13 <- lm(Perc_LNP ~ ., 
                data = data_mod %>% 
                  filter(year == "2013") %>% 
                  select(Perc_LNP,  (superset$varname %>% as.character)))

fit_ss_10 <- lm(Perc_LNP ~ ., 
                data = data_mod %>% 
                  filter(year == "2010") %>% 
                  select(Perc_LNP,  (superset$varname %>% as.character)))

fit_ss_07 <- lm(Perc_LNP ~ ., 
                data = data_mod %>% 
                  filter(year == "2007") %>% 
                  select(Perc_LNP,  (superset$varname %>% as.character)))

fit_ss_04 <- lm(Perc_LNP ~ ., 
                data = data_mod %>% 
                  filter(year == "2004") %>% 
                  select(Perc_LNP,  (superset$varname %>% as.character)))

fit_ss_01 <- lm(Perc_LNP ~ ., 
                data = data_mod %>% 
                  filter(year == "2001") %>% 
                  select(Perc_LNP, (superset$varname %>% as.character)))

# Extracting coefficients
ss_coef = data.frame(varname = names(fit_ss_01$coefficients),
                     year = as.character(rep(seq(2001, 2016, by = 3), 
                                             each = length(names(fit_ss_01$coefficients)))),
                     coef =
                       c(fit_ss_01$coefficients, 
                         fit_ss_04$coefficients, 
                         fit_ss_07$coefficients, 
                         fit_ss_10$coefficients, 
                         fit_ss_13$coefficients, 
                         fit_ss_16$coefficients))

# Point plot of coefficients for each variable, coloured by year
ss_coef %>% 
  filter(varname != "(Intercept)") %>% 
  ggplot(aes(x=varname, y=coef)) +
  geom_point(aes(col = year), size = 2.5, alpha = 0.8) + 
  theme(axis.text.x = element_text(angle = 60, hjust=1, size = 12))

Plotting coefficients over time

The plot shows the significance (T test) of each variable, across years.

## Create df of coefficients
library(broom)
all_coefs <- bind_rows(tidy(fit_ss_16) %>% mutate(year = "2016"), 
                       tidy(fit_ss_13) %>% mutate(year = "2013"), 
                       tidy(fit_ss_10) %>% mutate(year = "2010"), 
                       tidy(fit_ss_07) %>% mutate(year = "2007"), 
                       tidy(fit_ss_04) %>% mutate(year = "2004"), 
                       tidy(fit_ss_01) %>% mutate(year = "2001")) %>% 
  mutate(upper95 = estimate + 1.96*std.error, lower95 = estimate - 1.96*std.error)

# Plot 95% confidence intervals for each coefficient
all_coefs %>% 
  filter(term != "(Intercept)") %>% 
  ggplot(aes(x = year, y = estimate)) + 
  geom_point() + geom_errorbar(aes(ymin = lower95, ymax = upper95), width = 0.25) + 
  geom_hline(aes(yintercept = 0), alpha = 0.5) + 
  facet_wrap(~term) + 
  theme(axis.text.x = element_text(angle = 60, hjust=1, size = 12))

Or alternatively use an F test (drop1) to test significance.

all_sig <- bind_rows(tidy(drop1(fit_ss_01, test = "F")) %>% mutate(year = "2001"),
                     tidy(drop1(fit_ss_04, test = "F")) %>% mutate(year = "2004"),
                     tidy(drop1(fit_ss_07, test = "F")) %>% mutate(year = "2007"),
                     tidy(drop1(fit_ss_10, test = "F")) %>% mutate(year = "2010"),
                     tidy(drop1(fit_ss_13, test = "F")) %>% mutate(year = "2013"),
                     tidy(drop1(fit_ss_16, test = "F")) %>% mutate(year = "2016")) %>% 
  rename(pval_F = p.value) %>% select(term, pval_F, year)
## Warning in tidy.anova(drop1(fit_ss_01, test = "F")): The following column
## names in ANOVA output were not recognized or transformed: AIC
## Warning in tidy.anova(drop1(fit_ss_04, test = "F")): The following column
## names in ANOVA output were not recognized or transformed: AIC
## Warning in tidy.anova(drop1(fit_ss_07, test = "F")): The following column
## names in ANOVA output were not recognized or transformed: AIC
## Warning in tidy.anova(drop1(fit_ss_10, test = "F")): The following column
## names in ANOVA output were not recognized or transformed: AIC
## Warning in tidy.anova(drop1(fit_ss_13, test = "F")): The following column
## names in ANOVA output were not recognized or transformed: AIC
## Warning in tidy.anova(drop1(fit_ss_16, test = "F")): The following column
## names in ANOVA output were not recognized or transformed: AIC
all_coefs_sig <- left_join(all_coefs, all_sig, by = c("term", "year"))                   

# Plot
all_coefs_sig %>% 
  filter(term != "(Intercept)") %>% 
  ggplot(aes(x = year, y = estimate)) + 
  geom_hline(aes(yintercept = 0), alpha = 0.5) + 
  geom_line(aes(group = term)) + 
  geom_point(aes(col = (pval_F < 0.05))) + 
  scale_color_manual(values = c("grey50", "magenta")) + 
  facet_wrap(~term) + 
  theme(axis.text.x = element_text(angle = 60, hjust=1, size = 12))

Diagnostics

aug_all <- bind_rows(augment(fit_ss_01) %>% mutate(year = "2001"), 
                     augment(fit_ss_04) %>% mutate(year = "2004"), 
                     augment(fit_ss_07) %>% mutate(year = "2007"), 
                     augment(fit_ss_10) %>% mutate(year = "2010"), 
                     augment(fit_ss_13) %>% mutate(year = "2013"), 
                     augment(fit_ss_16) %>% mutate(year = "2016"))

Residual heteroskedasticity

# Response vs fitted
aug_all %>% 
  ggplot(aes(x=.fitted, y=.resid)) + 
  geom_point() + geom_smooth() +
  facet_wrap(~year)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

No signs of heteroskedasticity.

Residual normality

# Histogram
aug_all %>% 
  ggplot(aes(x = .resid)) + 
  geom_histogram() +
  facet_wrap(~year)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# QQ plots
aug_all %>% 
  ggplot(aes(sample = .resid)) + 
  geom_qq() + geom_qq_line() + 
  facet_wrap(~year)

Histograms don’t look smooth. Residuals look good. Possibly fat lower tails in 2004, 2007 and 2010.

Non-linearities: Residuals against predictors

aug_all %>%
  filter(year == "2016") %>% 
  gather(key = "variable", value = "value", -c(starts_with("."), Perc_LNP, year)) %>% 
  ggplot(aes(x = value, y = .resid)) +
  geom_point() + geom_smooth() + facet_wrap(~variable, scales = "free_x")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

No obvious non-linearities in 2001, 2004, 2007, 2010, 2013, 2016. Perhaps could add a quadratic term for NoReligion, to capture the drop seen in 2013 and 2016.

Residuals vs Predictors not in model

vars_not_in <- names(data_mod)[which(!names(data_mod) %in% superset$varname)]

aug_all %>% bind_cols(data_mod %>% filter(!is.na(Perc_LNP))) %>% 
  filter(year == "2001") %>% 
  select(starts_with("."), c(vars_not_in), -c(Swing, year, Election_Division)) %>% 
  gather(key = "variable", value = "value", -c(starts_with("."), Perc_LNP)) %>%
  ggplot(aes(x = value, y = .resid)) +
  geom_point() + geom_smooth(level = 0.99) + facet_wrap(~variable, scales = "free")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

2016: Anglican perhaps 2013: Anglican perhaps 2010: Anglican perhaps 2007: None 2004: None 2001: None

Influence

# Cook distance
aug_all %>% 
  ggplot(aes(x=.cooksd)) + geom_dotplot() + facet_wrap(~year)
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

aug_all %>% bind_cols(data_mod %>% filter(!is.na(Perc_LNP))) %>% select(-ends_with("1")) %>%  filter(.cooksd > 0.10)
##   Perc_LNP  MedianAge Born_SE_Europe Extractive ManagerAdminClericalSales
## 1    28.51 -0.9286047     2.48811857 -0.6877925                -2.8907611
## 2    33.58 -1.2237522     0.02743685 -0.7395242                 0.3927632
## 3    30.50 -1.6106212    -0.15100533 -0.7753107                 0.3901356
## 4    37.52 -0.4159843    -0.77178334  2.4508072                -2.2928993
##     RentLoan NoReligion OneParent_House    Born_UK       Educ
## 1 -0.4754758  -1.950273       2.5019742 -1.1165372 -0.6094315
## 2  2.5124411   2.026871      -0.9977573  0.4508997  2.7357361
## 3  2.5661179   2.160849      -1.3891426  0.4789316  2.7526920
## 4 -0.4802068  -1.108718       0.5997463 -0.9133097 -1.2095348
##   CurrentlyStudying OtherLanguageHome Unemployed  .fitted  .se.fit
## 1         0.8564924         3.5299382  3.2813214 16.18128 2.584872
## 2         4.2511675         0.7032289 -0.4553721 45.52587 2.839102
## 3         4.2449600         0.8215534 -0.1367056 44.77817 2.659725
## 4        -0.1833518        -0.9757471 -0.1506388 56.06694 2.471739
##      .resid      .hat   .sigma   .cooksd .std.resid year Election_Division
## 1  12.32872 0.1971121 5.716076 0.1054706   2.363242 2001            FOWLER
## 2 -11.94587 0.2292720 5.835581 0.1205120  -2.294890 2004            SYDNEY
## 3 -14.27817 0.2330428 5.350163 0.2046715  -2.959157 2007            SYDNEY
## 4 -18.54694 0.1410645 6.378373 0.1168182  -3.040874 2010            HUNTER
##   Swing LFParticipation    Catholic   Buddhism       Islam    Judaism
## 1  4.51      -2.0348616  0.26611658  8.2861195  1.68862158 -0.2493101
## 2 -1.38       0.5698251 -0.48116035  1.3844471 -0.02281318  0.5718827
## 3 -2.12      -0.1180531 -0.61632533  1.7315766 -0.07743925  0.4087103
## 4 -3.20      -0.1670680  0.05220012 -0.7254575 -0.57994798 -0.2430100
##   Indigenous AusCitizen Born_MidEast  Born_Asia Transformative
## 1 -0.2874485  -1.059928   2.76102579  5.3510962      2.7871713
## 2 -0.2374973  -5.284247  -0.05091008  1.6220209     -2.0753331
## 3 -0.2699952  -4.938373  -0.06636706  2.0630601     -2.2394845
## 4  0.3556645   1.124934  -0.45922967 -0.8792213     -0.1209952
##   Distributive SocialServ   Anglican OtherChrist BornElsewhere
## 1    0.5697183 -2.3099607 -1.8054990  -0.9686069      2.893666
## 2   -2.0042337 -0.2302874 -0.7997626  -1.7545670      2.059270
## 3   -1.6977589 -0.4807480 -0.9193700  -1.8326709      1.979830
## 4   -0.8514647 -1.7003698  2.3035007  -0.7155670     -1.225617
##   FamHouseSize PropertyMarr    Incomes
## 1    1.5650333  -0.78395994 -1.1220007
## 2   -3.7473441  -4.40221530  2.3808890
## 3   -3.5189607  -4.23530827  2.1894082
## 4    0.3958646   0.06025234 -0.3080689

Only four electorates across all elections appear to have large influence (cook distance > 0.1). All of these strongly supported Labor with \(TPP < 0.38\). Sydney is one in 2004 and 2007, due to its very high proportion of people currently studying, along with high rental/loan payments, education and high proportion of NoReligion. Fowler in 2001 (high unemployement, other language at home and one parent house) and Hunter in 2010 (low educ, high extractive).

Comparing estimated effects with raw plots

If relationship appears to be the same, then variable is not masked by any of the others. If the relationship is not, then there is some dependence.

Go through variables one by one, and examine effects

How coefficients change when we drop a variable

If the coefficient changes significantly, then there is some dependency

drop_var <- function(fit) {
  dat <- fit$model
for (j in 2:ncol(dat)) {
  omitted_name <- names(dat)[j]
  dat_mod <- dat[, -j]
  mod <- lm(Perc_LNP ~ ., data = dat_mod)
  mod_out <- tidy(mod) %>% 
    mutate(omitted_var = omitted_name) %>% 
    left_join(tidy(fit) %>% select(term, estimate) %>% rename(full_estimate = estimate), by = "term") %>% 
    mutate(diff_estimate = full_estimate - estimate)
  
  if (j == 2) {
    mod_all <- mod_out
  } else {
    mod_all <- rbind(mod_all, mod_out)
  }
  
    
}
  
  mod_all <- mod_all %>% 
    arrange(-abs(diff_estimate)) %>% 
    filter(term != "(Intercept)")
  
  return(mod_all)
}


change_coef_16 <- drop_var(fit_ss_16) %>% mutate(year = "2016")
change_coef_13 <- drop_var(fit_ss_13) %>% mutate(year = "2013")
change_coef_10 <- drop_var(fit_ss_10) %>% mutate(year = "2010")
change_coef_07 <- drop_var(fit_ss_07) %>% mutate(year = "2007")
change_coef_04 <- drop_var(fit_ss_04) %>% mutate(year = "2004")
change_coef_01 <- drop_var(fit_ss_01) %>% mutate(year = "2001")

change_coef_all <- bind_rows(change_coef_16, change_coef_13, change_coef_10, change_coef_07, change_coef_04, change_coef_01) %>% mutate(lower95 = estimate - 1.96*std.error, upper95 = estimate + 1.96*std.error)

Born in South/Eastern Europe

all_coefs %>% filter(term == "Born_SE_Europe") %>% 
  ggplot(aes(x=year, y=estimate)) + 
  geom_point() + geom_errorbar(aes(ymin = lower95, ymax = upper95), width = 0.25) + 
  geom_hline(aes(yintercept = 0), alpha = 0.5) + 
  theme(axis.text.x = element_text(angle = 60, hjust=1, size = 12)) + lims(y=c(-8,8))

## Voting against Born_SE_Europe
data_mod %>% ggplot(aes(x = Born_SE_Europe, y = Perc_LNP)) + geom_point(aes(label = Election_Division)) + geom_smooth(method = "lm") + facet_wrap(~year)
## Warning: Ignoring unknown aesthetics: label
## Warning: Removed 9 rows containing non-finite values (stat_smooth).
## Warning: Removed 9 rows containing missing values (geom_point).

Electorates with higher proportions of South or Eastern European populations are affiliated with support for Labor.

Born in the UK

all_coefs %>% filter(term == "Born_UK") %>% 
  ggplot(aes(x=year, y=estimate)) +
  geom_point() + geom_errorbar(aes(ymin = lower95, ymax = upper95), width = 0.25) + 
  geom_hline(aes(yintercept = 0), alpha = 0.5) + 
  theme(axis.text.x = element_text(angle = 60, hjust=1, size = 12)) + lims(y=c(-8,8))

## Voting against Born_SE_Europe
data_mod %>% ggplot(aes(x = Born_UK, y = Perc_LNP)) + geom_point(aes(label = Election_Division)) + geom_smooth(method = "lm") + facet_wrap(~year)
## Warning: Ignoring unknown aesthetics: label
## Warning: Removed 9 rows containing non-finite values (stat_smooth).
## Warning: Removed 9 rows containing missing values (geom_point).

## Moore
aug_all %>% bind_cols(data_mod %>% filter(!is.na(Perc_LNP))) %>% select(-ends_with("1")) %>%  filter(Election_Division == "MOORE")
##   Perc_LNP  MedianAge Born_SE_Europe  Extractive ManagerAdminClericalSales
## 1    56.04 -0.5438238     -0.4416525 -0.37568144                 1.1226597
## 2    60.83 -0.3929581     -0.4867418 -0.22329806                 0.8198340
## 3    59.17 -0.3372081     -0.4925636 -0.09597450                 0.6023632
## 4    61.19 -0.4282093     -0.4984828  0.03436916                 0.5855277
## 5    61.86 -0.1984986     -0.5314485  0.08587588                 0.6155377
## 6    61.02  0.1482080     -0.5210374  0.09294357                 0.6947601
##     RentLoan NoReligion OneParent_House  Born_UK        Educ
## 1 0.02991237  0.5763304       -1.196613 4.320695 -0.05681388
## 2 0.09830787  0.5401145       -1.188509 4.495597 -0.08454275
## 3 0.29179384  0.4682398       -1.158263 4.638739 -0.12618516
## 4 0.68913011  0.3812600       -1.160612 4.907560 -0.16046244
## 5 0.79111622  0.3441159       -1.337822 5.009481 -0.07955456
## 6 0.76189574  0.3145120       -1.477880 4.757788  0.04082498
##   CurrentlyStudying OtherLanguageHome  Unemployed  .fitted  .se.fit
## 1         1.1308396        -0.4470591 -0.54595512 60.48053 2.709834
## 2         0.6756263        -0.4731773 -1.03819495 64.28357 2.829224
## 3         0.3188550        -0.4680389 -1.37535344 63.43024 2.697508
## 4         0.4596217        -0.4620899 -1.15672711 66.50672 3.395199
## 5         0.1621151        -0.5245402 -0.67729212 68.74142 3.104583
## 6        -0.2238452        -0.5729920 -0.04205732 63.54039 3.342617
##      .resid      .hat   .sigma     .cooksd .std.resid year
## 1 -4.440535 0.2166310 5.828034 0.015796112 -0.8617270 2001
## 2 -3.453566 0.2276793 5.941539 0.009961152 -0.6627710 2004
## 3 -4.260240 0.2397107 5.513910 0.019072891 -0.8868000 2007
## 4 -5.316721 0.2661602 6.583702 0.024814054 -0.9430823 2010
## 5 -6.881418 0.2788949 5.859235 0.056531427 -1.3784648 2013
## 6 -2.520395 0.2634104 6.531884 0.005592876 -0.4509056 2016
##   Election_Division Swing LFParticipation    Catholic   Buddhism
## 1             MOORE  0.42        1.691430 -0.15523841 -0.3610543
## 2             MOORE  4.79        1.972116 -0.19816865 -0.4178540
## 3             MOORE -1.66        2.125393 -0.21046032 -0.4687588
## 4             MOORE -2.26        2.106928 -0.21950424 -0.4643023
## 5             MOORE -0.67        1.974156 -0.15476910 -0.5199252
## 6             MOORE -1.42        1.725074  0.01884079 -0.5198247
##        Islam    Judaism Indigenous AusCitizen Born_MidEast  Born_Asia
## 1 -0.2324744 -0.1643550 -0.4253132 -0.6971303   -0.2460388 -0.2530300
## 2 -0.2744377 -0.1644303 -0.4344362 -0.6648978   -0.2934932 -0.3343101
## 3 -0.3024756 -0.1628949 -0.4428027 -0.6943575   -0.3225926 -0.4003551
## 4 -0.2997532 -0.1798245 -0.4340121 -0.7167070   -0.3229389 -0.4662736
## 5 -0.3721719 -0.1908884 -0.4719055 -0.3834964   -0.3613040 -0.5480473
## 6 -0.3979793 -0.1939006 -0.5186419  0.1409623   -0.3529644 -0.5788780
##   Transformative Distributive SocialServ  Anglican OtherChrist
## 1    -0.11086556   0.01499303 0.39997834 0.7366627  -1.1071960
## 2     0.03409458  -0.36387887 0.34356563 0.7541129  -1.0752897
## 3     0.19638640  -0.62025594 0.22176088 0.7767155  -0.9758223
## 4     0.38718723  -0.66432417 0.04250528 0.8343120  -0.7907403
## 5     0.41612848  -0.75890360 0.22906001 0.8378599  -0.7115191
## 6     0.36306267  -0.79763553 0.61819455 0.7901623  -0.6364092
##   BornElsewhere FamHouseSize PropertyMarr   Incomes
## 1    0.41915404    1.0008782    1.0545004 0.8926017
## 2    0.26248866    0.8885396    1.0667044 0.9958852
## 3    0.20413572    0.8125954    1.0304361 1.1135297
## 4    0.20955263    0.9408657    0.9941103 1.3040951
## 5    0.10424849    0.8259520    1.2077459 1.3078407
## 6   -0.02268286    0.6428592    1.4921378 1.1900471

Electorates with higher proportions of people from the UK are affiliated with support for LNP across all years, however this is only significant in 2007 and 2010. The electorate Moore (WA) has the highest proportion of migrants from the UK, and has high leverage (.hat > 0.2 for all years).

Currently Studying

all_coefs %>% filter(term == "CurrentlyStudying") %>% 
  ggplot(aes(x=year, y=estimate)) +
  geom_point() + geom_errorbar(aes(ymin = lower95, ymax = upper95), width = 0.25) + 
  geom_hline(aes(yintercept = 0), alpha = 0.5) + 
  theme(axis.text.x = element_text(angle = 60, hjust=1, size = 12)) + lims(y=c(-8,8))

## Voting against CurrentlyStudying
data_mod %>% ggplot(aes(x = CurrentlyStudying, y = Perc_LNP)) + geom_point(aes(label = Election_Division)) + geom_smooth(method = "lm") + facet_wrap(~year)
## Warning: Ignoring unknown aesthetics: label
## Warning: Removed 9 rows containing non-finite values (stat_smooth).
## Warning: Removed 9 rows containing missing values (geom_point).

Inspection of the raw data indicates that electoratres with more people currently studying should have an affiliation with the Labor party. This is also what we intuitively may expect, as Labor policies have been more favorable for students, often pitching more funding for eduaction than LNP. Our model indicates otherwise. In all but 2007 and 2010, after adjusting for other socio-demographics, the proportion of the population currently studying has no significant effect on party preference.

To test for confounding, we re-fit the model with dropping a single variable, and observe the change in coefficient for CurrentlyStudying. When omitting MedianAge, the coefficient decreases by a large amount, which is not surprising given these two are negatively correlated (\(\rho = -0.683\)). Similarly, the effect of omitting higher education levels is absorbed by CurrentlyStudying, causing a decrease in estimated coefficient.

change_coef_all %>% 
  filter(term == "CurrentlyStudying") %>% 
  ggplot(aes(x=omitted_var, y=estimate)) + 
  geom_point() + geom_errorbar(aes(ymin = lower95, ymax = upper95), width = 0.25) + 
  geom_hline(aes(yintercept = full_estimate), col = "blue") + 
  geom_hline(aes(yintercept = 0), col = "grey50") + 
  facet_wrap(~year) + 
  theme(axis.text.x = element_text(angle = 60, hjust=1, size = 12)) 

cor(data_mod$MedianAge, data_mod$CurrentlyStudying)
## [1] -0.682983

Educ

all_coefs %>% filter(term == "Educ") %>% 
  ggplot(aes(x=year, y=estimate)) +
  geom_point() + geom_errorbar(aes(ymin = lower95, ymax = upper95), width = 0.25) + 
  geom_hline(aes(yintercept = 0), alpha = 0.5) + 
  theme(axis.text.x = element_text(angle = 60, hjust=1, size = 12)) + lims(y=c(-8,8))

## Voting against Educ
data_mod %>% ggplot(aes(x = CurrentlyStudying, y = Perc_LNP)) + geom_point(aes(label = Election_Division)) + geom_smooth(method = "lm") + facet_wrap(~year)
## Warning: Ignoring unknown aesthetics: label
## Warning: Removed 9 rows containing non-finite values (stat_smooth).
## Warning: Removed 9 rows containing missing values (geom_point).

Higher levels of education were associated with support for the Labor party in 2004 and 2007 elections, but were not significant in the others. We expected a significant effect in each election.

Omitting NoReligion results in a stronger estimated effect in support for Labor in 2010, 2013 and 2016.

change_coef_all %>% 
  filter(term == "Educ") %>% 
  ggplot(aes(x=omitted_var, y=estimate)) + 
  geom_point() + geom_errorbar(aes(ymin = lower95, ymax = upper95), width = 0.25) + 
  geom_hline(aes(yintercept = full_estimate), col = "blue") + 
  geom_hline(aes(yintercept = 0), col = "grey50") + 
  facet_wrap(~year) + 
  theme(axis.text.x = element_text(angle = 60, hjust=1, size = 12)) 

Extractive

all_coefs %>% filter(term == "Extractive") %>% 
  ggplot(aes(x=year, y=estimate)) +
  geom_point() + geom_errorbar(aes(ymin = lower95, ymax = upper95), width = 0.25) + 
  geom_hline(aes(yintercept = 0), alpha = 0.5) + 
  theme(axis.text.x = element_text(angle = 60, hjust=1, size = 12)) + lims(y=c(-8,8))

## Voting against Extractive
data_mod %>% ggplot(aes(x = Extractive, y = Perc_LNP)) + geom_point(aes(label = Election_Division)) + geom_smooth(method = "lm") + facet_wrap(~year)
## Warning: Ignoring unknown aesthetics: label
## Warning: Removed 9 rows containing non-finite values (stat_smooth).
## Warning: Removed 9 rows containing missing values (geom_point).

Our estimated effects are strongly positive each election, which matches our expectations based on the plots.

Median Age

all_coefs %>% filter(term == "MedianAge") %>% 
  ggplot(aes(x=year, y=estimate)) +
  geom_point() + geom_errorbar(aes(ymin = lower95, ymax = upper95), width = 0.25) + 
  geom_hline(aes(yintercept = 0), alpha = 0.5) + 
  theme(axis.text.x = element_text(angle = 60, hjust=1, size = 12)) + lims(y=c(-8,8))

## Voting against MedianAge
data_mod %>% ggplot(aes(x = MedianAge, y = Perc_LNP)) + geom_point(aes(label = Election_Division)) + geom_smooth(method = "lm") + facet_wrap(~year)
## Warning: Ignoring unknown aesthetics: label
## Warning: Removed 9 rows containing non-finite values (stat_smooth).
## Warning: Removed 9 rows containing missing values (geom_point).

Median age has a significantly positive relationship with support for LNP.

Unemployed

all_coefs %>% filter(term == "Unemployed") %>% 
  ggplot(aes(x=year, y=estimate)) +
  geom_point() + geom_errorbar(aes(ymin = lower95, ymax = upper95), width = 0.25) + 
  geom_hline(aes(yintercept = 0), alpha = 0.5) + 
  theme(axis.text.x = element_text(angle = 60, hjust=1, size = 12)) + lims(y=c(-8,8))

## Voting against Unemployed
data_mod %>% ggplot(aes(x = Unemployed, y = Perc_LNP)) + geom_point(aes(label = Election_Division)) + geom_smooth(method = "lm") + facet_wrap(~year)
## Warning: Ignoring unknown aesthetics: label
## Warning: Removed 9 rows containing non-finite values (stat_smooth).
## Warning: Removed 9 rows containing missing values (geom_point).

We expect there to be a strong relationship between Labor party support and high levels of unemployment, based on the raw data. The model indicates that this was true in 2001-2007, but in the previous three elections unemployment has not had a significant effect. Our initial conclusions are confounded by ManagerAdminClericalSales, MedianAge and OneParent_House. There is a negative correlation between ManagerAdminClericalSales (\(\rho = 0.513\)), meaning that areas with higher unemployment typically have less of the population working “office jobs”, hence omitting this variable causes bias in our estimated of Unemployment. There is a strong positive correlation between Unemployment and OneParent_House (\(\rho = 0.679\)), indicating that initial conclusions may be confusing these effects.

change_coef_all %>% 
  filter(term == "Unemployed") %>% 
  ggplot(aes(x=omitted_var, y=estimate)) + 
  geom_point() + geom_errorbar(aes(ymin = lower95, ymax = upper95), width = 0.25) + 
  geom_hline(aes(yintercept = full_estimate), col = "blue") + 
  geom_hline(aes(yintercept = 0), col = "grey50") + 
  facet_wrap(~year) + 
  theme(axis.text.x = element_text(angle = 60, hjust=1, size = 12)) 

cor(data_mod$OneParent_House, data_mod$Unemployed)
## [1] 0.6791213

ManagerAdminClericalSales

all_coefs %>% filter(term == "ManagerAdminClericalSales") %>% 
  ggplot(aes(x=year, y=estimate)) +
  geom_point() + geom_errorbar(aes(ymin = lower95, ymax = upper95), width = 0.25) + 
  geom_hline(aes(yintercept = 0), alpha = 0.5) + 
  theme(axis.text.x = element_text(angle = 60, hjust=1, size = 12)) + lims(y=c(-8,8))

## Voting against ManagerAdminClericalSales
data_mod %>% ggplot(aes(x = ManagerAdminClericalSales, y = Perc_LNP)) + geom_point(aes(label = Election_Division)) + geom_smooth(method = "lm") + facet_wrap(~year)
## Warning: Ignoring unknown aesthetics: label
## Warning: Removed 9 rows containing non-finite values (stat_smooth).
## Warning: Removed 9 rows containing missing values (geom_point).

Significantly positive relationship between ManagerAdminClericalSales and support for LNP across all years.

NoReligion

all_coefs %>% filter(term == "NoReligion") %>% 
  ggplot(aes(x=year, y=estimate)) +
  geom_point() + geom_errorbar(aes(ymin = lower95, ymax = upper95), width = 0.25) + 
  geom_hline(aes(yintercept = 0), alpha = 0.5) + 
  theme(axis.text.x = element_text(angle = 60, hjust=1, size = 12)) + lims(y=c(-8,8))

## Voting against NoReligion
data_mod %>% ggplot(aes(x = NoReligion, y = Perc_LNP)) + geom_point(aes(label = Election_Division)) + geom_smooth(method = "lm") + facet_wrap(~year)
## Warning: Ignoring unknown aesthetics: label
## Warning: Removed 9 rows containing non-finite values (stat_smooth).
## Warning: Removed 9 rows containing missing values (geom_point).

Negative effects estimated, but are only significant in 2010 and 2013. From the original plots we don’t expect to see any significant relationship.

change_coef_all %>% 
  filter(term == "NoReligion") %>% 
  ggplot(aes(x=omitted_var, y=estimate)) + 
  geom_point() + geom_errorbar(aes(ymin = lower95, ymax = upper95), width = 0.25) + 
  geom_hline(aes(yintercept = full_estimate), col = "blue") + 
  geom_hline(aes(yintercept = 0), col = "grey50") + 
  facet_wrap(~year) + 
  theme(axis.text.x = element_text(angle = 60, hjust=1, size = 12)) 

cor(data_mod$OneParent_House, data_mod$Unemployed)
## [1] 0.6791213

OneParent_House

all_coefs %>% filter(term == "OneParent_House") %>% 
  ggplot(aes(x=year, y=estimate)) +
  geom_point() + geom_errorbar(aes(ymin = lower95, ymax = upper95), width = 0.25) + 
  geom_hline(aes(yintercept = 0), alpha = 0.5) + 
  theme(axis.text.x = element_text(angle = 60, hjust=1, size = 12)) + lims(y=c(-8,8))

## Voting against OneParent_House
data_mod %>% ggplot(aes(x = OneParent_House, y = Perc_LNP)) + geom_point(aes(label = Election_Division)) + geom_smooth(method = "lm") + facet_wrap(~year)
## Warning: Ignoring unknown aesthetics: label
## Warning: Removed 9 rows containing non-finite values (stat_smooth).
## Warning: Removed 9 rows containing missing values (geom_point).

Significantly negative relationship observed in original plots and estimated in model.

OtherLanguageHome

all_coefs %>% filter(term == "OtherLanguageHome") %>% 
  ggplot(aes(x=year, y=estimate)) +
  geom_point() + geom_errorbar(aes(ymin = lower95, ymax = upper95), width = 0.25) + 
  geom_hline(aes(yintercept = 0), alpha = 0.5) + 
  theme(axis.text.x = element_text(angle = 60, hjust=1, size = 12)) + lims(y=c(-8,8))

## Voting against OtherLanguageHome
data_mod %>% ggplot(aes(x = OtherLanguageHome, y = Perc_LNP)) + geom_point(aes(label = Election_Division)) + geom_smooth(method = "lm") + facet_wrap(~year)
## Warning: Ignoring unknown aesthetics: label
## Warning: Removed 9 rows containing non-finite values (stat_smooth).
## Warning: Removed 9 rows containing missing values (geom_point).

Model estimates insignificant effect, whereas the raw data indicates there should be a negative relationship. We see that if we omit Born_SE_Europe, its effect is absorbed into OtherLanguageHome. This is telling us that it does not matter about the language spoken, but rather where the population is born.

change_coef_all %>% 
  filter(term == "OtherLanguageHome") %>% 
  ggplot(aes(x=omitted_var, y=estimate)) + 
  geom_point() + geom_errorbar(aes(ymin = lower95, ymax = upper95), width = 0.25) + 
  geom_hline(aes(yintercept = full_estimate), col = "blue") + 
  geom_hline(aes(yintercept = 0), col = "grey50") + 
  facet_wrap(~year) + 
  theme(axis.text.x = element_text(angle = 60, hjust=1, size = 12)) 

RentLoan

all_coefs %>% filter(term == "RentLoan") %>% 
  ggplot(aes(x=year, y=estimate)) +
  geom_point() + geom_errorbar(aes(ymin = lower95, ymax = upper95), width = 0.25) + 
  geom_hline(aes(yintercept = 0), alpha = 0.5) + 
  theme(axis.text.x = element_text(angle = 60, hjust=1, size = 12)) + lims(y=c(-8,8))

## Voting against RentLoan
data_mod %>% ggplot(aes(x = RentLoan, y = Perc_LNP)) + geom_point(aes(label = Election_Division)) + geom_smooth(method = "lm") + facet_wrap(~year)
## Warning: Ignoring unknown aesthetics: label
## Warning: Removed 9 rows containing non-finite values (stat_smooth).
## Warning: Removed 9 rows containing missing values (geom_point).

Expect insignificant effect from the raw plots. In 2010 we see significant support for LNP.

change_coef_all %>% 
  filter(term == "OtherLanguageHome") %>% 
  ggplot(aes(x=omitted_var, y=estimate)) + 
  geom_point() + geom_errorbar(aes(ymin = lower95, ymax = upper95), width = 0.25) + 
  geom_hline(aes(yintercept = full_estimate), col = "blue") + 
  geom_hline(aes(yintercept = 0), col = "grey50") + 
  facet_wrap(~year) + 
  theme(axis.text.x = element_text(angle = 60, hjust=1, size = 12)) 


Stories from the data

Education and Students

Education policy has always been key policy issue in the weeks leading up to a federal election. The Labor party, more often than not, campaign with policies that boost funding for education, and the coalition will usually propose alternatives with less funding or aim to match particular objectives - for example, the “Better Schools Plan” in 2013. It would be reasonable to expect that electorates with higher education levels, and higher student populations, would be inclined to support Labor every election, which is what we observe in our primary plots.

After adjusting for the other socio-demographics, we find that education levels are only related to support of the Labor party in the 2004 and 2007 elections, with other years being insignificant at the 5% level. If we omit the prevelence of atheism (NoReligion) from our model, the effect of education in favour of Labor increases to be significant in 2010 and 2013. The same is true for the proportion of people working in “extractive” jobs, in the same years. This indicates that our initial thoughts on the effect education levels would have is being confounded by these other factors. We are not laying any causal claims here, but rather indicating that our expectations about aggregate behaviour can be simeltaneously determined by grouping characteristics together. Essentially, our intuitions are very susceptible to omitted variable bias!

Similarly, we see higher student proportions being significantly related to increased Liberal support in 2007 and 2010. Omitting the age profile of the electorate (MedianAge) would lead to significant support for Labor in 2001, and would change our conclusions from significant to insignificant in 2007 and 2010. This is not surprising, as younger people are typically students, so higher student populations should mean lower median age, and these two are highly correlated (\(\rho = -0.683\)).

Unemployment

Our model indicates that in the 2001, 2004 and 2007 elections, higher unemployment rates were linked with more support for the Labor party. Interestingly, in all of those elections the Liberal party was the incumbent party with John Howard as Prime Minister. From 2010 onwards, education has not had a significant effect, which is not what we would expect from the raw data plots. If we omit “office jobs”, the estimated coefficient of unemployment becomes more negative, but remains insignificant in these three elections. Therefore, we conclude that the unemployment rate was a significant determinant in two-party preference for 2001, 2004 and 2007 elections, but has since become insignificant, after adjusting for other socio-demographics - contrary to expectations.

Other Language at Home

From the Tampa affair in 2001, when John Howard diverted the MV Tampa to Nauru (carrying 400 Afghani refugees), to the “stop the boats” mantra of the 2013 election, the Liberal party have consistently campaigned on policies of stricter immigration. We expect electorates with high proportions of people speaking languages other than English at home to also have high proportions of migrants (correlation \(\rho = 0.90\)), so OtherLanguageHome should have a positive relationship with voting Labor - which we see in the raw plots. After accounting for (some) birthplace demographics, we find that language at home has no significant effect on two-party preference.

load("~/Documents/R/Data/Clean/abs_all.rda")
abs_all <- abs_all %>% mutate(Born_Australia = 100 - Born_UK - Born_SE_Europe - Born_MidEast - Born_Asia - BornElsewhere) 

cor(abs_all$Born_Australia, abs_all$OtherLanguageHome)
## [1] -0.900876
cor(abs_all$Born_SE_Europe, abs_all$Born_Asia)
## [1] 0.3614971

Rental and Housing Loan Payments

In 2010, electorates with higher (median) rental prices and housing loan payments were significantly more likely to support the Liberal party. Although omitted from our model, the factor RentLoan has strong positive correlation with the Incomes factor (\(\rho = 0.83\)).

Extractive

Even in the election before the mining boom began in 2002, areas with more people working in “extractive” industries (mining, gas, water, gas, waste, electricity and agriculture) were significantly more supportive of the coalition. Over the following two elections the two parties did not differ significantly in their policies on natural resources, and we our coefficient estimates do not change much relative to 2001 - remaining significantly positive.

A resource “super profits” tax was proposed by the Labor party in the 2010 election, which would see a 40% tax on “super profits” made from “exploitation of Australia’s non-renewable resources”. This was heavily opposed by the Liberal party, and

In 2010, the Labor party proposed a 40% tax on “super profits” made from ‘the exploitation of Australia’s non-renewable resources’, which was heavily opposed by the Liberal party, who instead offered the Exploration Development Incentive. The mining sector, particularly in Western Australia, vehemently objected to Labor’s policy, with Chamber of Minerals and Energy chief executive describing it as a “\(\$9\) billion handbreak on the economy”. Consequently, we see a huge jump in the effect of extractive jobs in 2010, with estimated coefficient increasing from \(2.43\) to \(5.63\) - becoming the largest coefficient in the model. Following the end of the mining boom in 2012, the estimated coefficient drops but remains significantly positive in 2013 and 2016.